# Importación de textos.

##### importar datos
# Carga de librerías necesarias
suppressMessages(suppressWarnings(library(readr)))  # Lectura de archivos CSV
suppressMessages(suppressWarnings(library(tidyverse)))  # Conjunto de paquetes para manipulación de datos
# warnings debido a caracteres no UTF-8 o vacios ("")
# UTF-8 (8-bit Unicode Transformation Format) es un formato de codificación de caracteres 
# capaz de codificar todos los code points validos en Unicode
# Importar los textos de las conferencias
# Import the 1997 keynote transcript as a tibble (no header row in the file).
# NOTE(review): the transcripts are free text, not CSV; commas inside sentences
# are presumably what triggers the parsing warning below. readr::read_lines()
# would return one string per line -- confirm against the raw files.
text_1997 <- read_csv("AppleWWDC1997_es.txt", col_names = FALSE, show_col_types = FALSE)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
# The 2001, 2005, 2008 and 2010 transcripts are not read here: they are imported
# once, further down, straight into character vectors. Reading them here as well
# parsed every file twice, and these results were overwritten before any use.
# Step-by-step conversion of the imported tibble into a plain character vector.
class(text_1997)
## [1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame"
# c() on a data frame returns its columns as a plain list
text_1997 <- c(text_1997)
class(text_1997)
## [1] "list"
# flatten the list of columns into a single vector
text_1997 <- unlist(text_1997)
class(text_1997)
## [1] "character"
# drop the element names left over from the flattening (important!)
text_1997 <- unname(text_1997)
head(text_1997, n = 3)
## [1] "Buenos días"                          
## [2] "Ambos llevaban corbata toda la semana"
## [3] "Noticias"
# Remaining keynotes: read each transcript and reduce it to an unnamed character
# vector in a single pipeline (tibble -> list of columns -> flat vector).
text_2001 <- read_csv("AppleWWDC2001_es.txt", col_names = FALSE, show_col_types = FALSE) %>%
  c() %>%
  unlist() %>%
  unname()
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
text_2005 <- read_csv("AppleWWDC2005_es.txt", col_names = FALSE, show_col_types = FALSE) %>%
  c() %>%
  unlist() %>%
  unname()
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
text_2008 <- read_csv("AppleWWDC2008_es.txt", col_names = FALSE, show_col_types = FALSE) %>%
  c() %>%
  unlist() %>%
  unname()
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
text_2010 <- read_csv("AppleWWDC2010_es.txt", col_names = FALSE, show_col_types = FALSE) %>%
  c() %>%
  unlist() %>%
  unname()
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
##### data frame formato tidy

# Tidy layout: one row per element of the imported vector, with its position
# stored in `line`. seq_along() is used instead of 1:length(x), which would
# yield c(1, 0) for an empty transcript and make tibble() fail on mismatched
# column lengths.
text_1997 <- tibble(line = seq_along(text_1997), text = text_1997)  # tibble() rather than data_frame()
class(text_1997)
## [1] "tbl_df"     "tbl"        "data.frame"
dim(text_1997)
## [1] 1322    2
head(text_1997, n = 3)
## # A tibble: 3 × 2
##    line text                                 
##   <int> <chr>                                
## 1     1 Buenos días                          
## 2     2 Ambos llevaban corbata toda la semana
## 3     3 Noticias
# the text is not normalised yet
# it has no "structure" to analyse

text_2001 <- tibble(line = seq_along(text_2001), text = text_2001)
text_2005 <- tibble(line = seq_along(text_2005), text = text_2005)
text_2008 <- tibble(line = seq_along(text_2008), text = text_2008)
text_2010 <- tibble(line = seq_along(text_2010), text = text_2010)

#Tokenizacion

suppressMessages(suppressWarnings(library(tidytext)))
suppressMessages(suppressWarnings(library(magrittr)))
##### tokenizacion formato tidy

# One row per token: unnest_tokens() splits the `text` column into words (the
# recorded output shows them lower-cased); rows whose token is NA are dropped
# afterwards (important!).
text_1997 <- text_1997 %>%
  unnest_tokens(output = word, input = text) %>%
  filter(!is.na(word))
class(text_1997)
## [1] "tbl_df"     "tbl"        "data.frame"
dim(text_1997)
## [1] 11356     2
head(text_1997, n = 10)
## # A tibble: 10 × 2
##     line word    
##    <int> <chr>   
##  1     1 buenos  
##  2     1 días    
##  3     2 ambos   
##  4     2 llevaban
##  5     2 corbata 
##  6     2 toda    
##  7     2 la      
##  8     2 semana  
##  9     3 noticias
## 10     4 tú
text_2001 <- text_2001 %>%
  unnest_tokens(output = word, input = text) %>%
  filter(!is.na(word))
dim(text_2001)
## [1] 15099     2
text_2005 <- text_2005 %>%
  unnest_tokens(output = word, input = text) %>%
  filter(!is.na(word))
dim(text_2005)
## [1] 8102    2
text_2008 <- text_2008 %>%
  unnest_tokens(output = word, input = text) %>%
  filter(!is.na(word))
dim(text_2008)
## [1] 14515     2
text_2010 <- text_2010 %>%
  unnest_tokens(output = word, input = text) %>%
  filter(!is.na(word))
dim(text_2010)
## [1] 6412    2
head(text_2001, n = 10)
## # A tibble: 10 × 2
##     line word     
##    <int> <chr>    
##  1     1 buenos   
##  2     1 días     
##  3     2 estamos  
##  4     2 muy      
##  5     2 contentos
##  6     2 de       
##  7     2 estar    
##  8     2 aquí     
##  9     2 en       
## 10     2 nueva
head(text_2005, n = 10)
## # A tibble: 10 × 2
##     line word           
##    <int> <chr>          
##  1     1 bienvenidos    
##  2     1 a              
##  3     1 nuestra        
##  4     1 conferencia    
##  5     1 mundial        
##  6     1 de             
##  7     1 desarrolladores
##  8     1 2005           
##  9     1 hoy            
## 10     1 es
head(text_2008, n = 10)
## # A tibble: 10 × 2
##     line word    
##    <int> <chr>   
##  1     1 estoy   
##  2     1 muy     
##  3     1 contento
##  4     1 de      
##  5     1 estar   
##  6     1 aquí    
##  7     1 esta    
##  8     1 vez     
##  9     2 buenos  
## 10     2 días
head(text_2010, n = 10)
## # A tibble: 10 × 2
##     line word     
##    <int> <chr>    
##  1     1 así      
##  2     1 que      
##  3     1 volvamos 
##  4     1 al       
##  5     1 iphone   
##  6     2 en       
##  7     2 2007     
##  8     2 el       
##  9     2 iphone   
## 10     2 reinventó

#Normalización de texto

##### texto con numeros?

# 1997: tokens containing at least one digit, most frequent first
text_1997 %>%
  filter(grepl("[0-9]", word)) %>%
  count(word, sort = TRUE)
## # A tibble: 35 × 2
##    word      n
##    <chr> <int>
##  1 10       13
##  2 18        6
##  3 20        4
##  4 100       3
##  5 30        3
##  6 5         3
##  7 500       3
##  8 12        2
##  9 14        2
## 10 15        2
## # ℹ 25 more rows
# 2001: tokens containing at least one digit, most frequent first
text_2001 %>%
  filter(grepl("[0-9]", word)) %>%
  count(word, sort = TRUE)
## # A tibble: 95 × 2
##    word      n
##    <chr> <int>
##  1 10      102
##  2 g4       33
##  3 4        14
##  4 1        13
##  5 3        13
##  6 867      13
##  7 os10     13
##  8 3d       12
##  9 7        12
## 10 99       10
## # ℹ 85 more rows
# 2005: tokens containing at least one digit, most frequent first
text_2005 %>%
  filter(grepl("[0-9]", word)) %>%
  count(word, sort = TRUE)
## # A tibble: 58 × 2
##    word      n
##    <chr> <int>
##  1 10       28
##  2 20        8
##  3 2.1       6
##  4 2         5
##  5 2006      4
##  6 400       4
##  7 264       3
##  8 500       3
##  9 7         3
## 10 9         3
## # ℹ 48 more rows
# 2008: tokens containing at least one digit, most frequent first
text_2008 %>%
  filter(grepl("[0-9]", word)) %>%
  count(word, sort = TRUE)
## # A tibble: 72 × 2
##    word      n
##    <chr> <int>
##  1 3g       33
##  2 2.0      21
##  3 10       20
##  4 70        5
##  5 100       4
##  6 11        4
##  7 199       4
##  8 20        4
##  9 3d        4
## 10 5         4
## # ℹ 62 more rows
# 2010: tokens containing at least one digit, most frequent first
text_2010 %>%
  filter(grepl("[0-9]", word)) %>%
  count(word, sort = TRUE)
## # A tibble: 43 × 2
##    word      n
##    <chr> <int>
##  1 4        42
##  2 3gs      11
##  3 a4        9
##  4 3g        8
##  5 199       5
##  6 2010      5
##  7 2007      4
##  8 24        4
##  9 30        4
## 10 720p      4
## # ℹ 33 more rows
##### remover texto con numeros

# Drop every token that contains a digit; dim() shows the remaining row count.
text_1997 <- text_1997 %>%
  filter(!grepl("[0-9]", word))
dim(text_1997)
## [1] 11285     2
text_2001 <- text_2001 %>%
  filter(!grepl("[0-9]", word))
dim(text_2001)
## [1] 14670     2
text_2005 <- text_2005 %>%
  filter(!grepl("[0-9]", word))
dim(text_2005)
## [1] 7977    2
text_2008 <- text_2008 %>%
  filter(!grepl("[0-9]", word))
dim(text_2008)
## [1] 14323     2
text_2010 <- text_2010 %>%
  filter(!grepl("[0-9]", word))
dim(text_2010)
## [1] 6262    2
dim(stop_words)
## [1] 1149    2
head(stop_words, n = 10)
## # A tibble: 10 × 2
##    word        lexicon
##    <chr>       <chr>  
##  1 a           SMART  
##  2 a's         SMART  
##  3 able        SMART  
##  4 about       SMART  
##  5 above       SMART  
##  6 according   SMART  
##  7 accordingly SMART  
##  8 across      SMART  
##  9 actually    SMART  
## 10 after       SMART
table(stop_words$lexicon)
## 
##     onix    SMART snowball 
##      404      571      174
###### stop words 
# no hay diccionarios en español disponibles en tidytext
# diccionario COUNTWORDSFREE en español (con acentos)
# http://countwordsfree.com/stopwords/spanish
# otras alternativas:
#   https://github.com/stopwords-iso/stopwords-es
#   de tm::stopwords("spanish")
# se conserva el mismo formato de los diccionarios en tidytext
# Custom Spanish stop-word list, stored as (word, lexicon) columns like
# tidytext's stop_words: read the file, flatten it to a vector, wrap in a tibble.
stop_words_es <- read.table("stop_words_spanish.txt", quote = "\"", comment.char = "") %>%
  c() %>%
  unlist() %>%
  tibble(word = ., lexicon = "custom")
dim(stop_words_es)
## [1] 444   2
head(stop_words_es, n = 10)
## # A tibble: 10 × 2
##    word      lexicon
##    <chr>     <chr>  
##  1 algún     custom 
##  2 alguna    custom 
##  3 algunas   custom 
##  4 alguno    custom 
##  5 algunos   custom 
##  6 ambos     custom 
##  7 ampleamos custom 
##  8 ante      custom 
##  9 antes     custom 
## 10 aquel     custom
##### remover stop words

# 1997: keep only tokens with no match in the Spanish stop-word list
text_1997 <- anti_join(text_1997, stop_words_es)
## Joining with `by = join_by(word)`
dim(text_1997)
## [1] 4009    2
head(text_1997, n = 10)
## # A tibble: 10 × 2
##     line word    
##    <int> <chr>   
##  1     1 días    
##  2     2 llevaban
##  3     2 corbata 
##  4     2 semana  
##  5     3 noticias
##  6     4 tú      
##  7     5 corto   
##  8     5 haré    
##  9     5 uh      
## 10     5 sexto
# 2001: keep only tokens with no match in the Spanish stop-word list
text_2001 <- anti_join(text_2001, stop_words_es)
## Joining with `by = join_by(word)`
dim(text_2001)
## [1] 5957    2
head(text_2001, n = 10)
## # A tibble: 10 × 2
##     line word     
##    <int> <chr>    
##  1     1 días     
##  2     2 contentos
##  3     2 york     
##  4     2 geniales 
##  5     2 compartir
##  6     3 ustedes  
##  7     3 mañana   
##  8     4 tiendas  
##  9     5 tiendas  
## 10     5 tyson's
# 2005: keep only tokens with no match in the Spanish stop-word list
text_2005 <- anti_join(text_2005, stop_words_es)
## Joining with `by = join_by(word)`
dim(text_2005)
## [1] 3080    2
head(text_2005, n = 10)
## # A tibble: 10 × 2
##     line word           
##    <int> <chr>          
##  1     1 bienvenidos    
##  2     1 conferencia    
##  3     1 mundial        
##  4     1 desarrolladores
##  5     1 día            
##  6     1 importante     
##  7     2 geniales       
##  8     2 ti             
##  9     2 quiero         
## 10     2 comenzar
# 2008: keep only tokens with no match in the Spanish stop-word list
text_2008 <- anti_join(text_2008, stop_words_es)
## Joining with `by = join_by(word)`
dim(text_2008)
## [1] 5820    2
head(text_2008, n = 10)
## # A tibble: 10 × 2
##     line word      
##    <int> <chr>     
##  1     1 contento  
##  2     2 días      
##  3     2 trabajando
##  4     2 duro      
##  5     2 geniales  
##  6     2 ansiosos  
##  7     2 compartir 
##  8     2 ustedes   
##  9     2 gracias   
## 10     2 venir
# 2010: keep only tokens with no match in the Spanish stop-word list
text_2010 <- anti_join(text_2010, stop_words_es)
## Joining with `by = join_by(word)`
dim(text_2010)
## [1] 2421    2
head(text_2010, n = 10)
## # A tibble: 10 × 2
##     line word        
##    <int> <chr>       
##  1     1 volvamos    
##  2     1 iphone      
##  3     2 iphone      
##  4     2 reinventó   
##  5     2 consideramos
##  6     2 teléfono    
##  7     3 difícil     
##  8     3 recordar    
##  9     3 operadores  
## 10     3 iphone
##### strip accents
# Accented vowel -> plain vowel; the names are the characters to replace.
replacement_list <- list("á" = "a", "é" = "e", "í" = "i", "ó" = "o", "ú" = "u")

# chartr() translates character by character, so both sides are collapsed into
# equally long strings ("áéíóú" -> "aeiou").
text_1997 <- text_1997 %>%
  mutate(word = chartr(
    old = str_c(names(replacement_list), collapse = ""),
    new = str_c(replacement_list, collapse = ""),
    x = word
  ))
dim(text_1997)
## [1] 4009    2
head(text_1997, n = 10)
## # A tibble: 10 × 2
##     line word    
##    <int> <chr>   
##  1     1 dias    
##  2     2 llevaban
##  3     2 corbata 
##  4     2 semana  
##  5     3 noticias
##  6     4 tu      
##  7     5 corto   
##  8     5 hare    
##  9     5 uh      
## 10     5 sexto
# 2001: replace accented vowels with their plain counterparts
text_2001 <- text_2001 %>%
  mutate(word = chartr(
    old = str_c(names(replacement_list), collapse = ""),
    new = str_c(replacement_list, collapse = ""),
    x = word
  ))
dim(text_2001)
## [1] 5957    2
head(text_2001, n = 10)
## # A tibble: 10 × 2
##     line word     
##    <int> <chr>    
##  1     1 dias     
##  2     2 contentos
##  3     2 york     
##  4     2 geniales 
##  5     2 compartir
##  6     3 ustedes  
##  7     3 mañana   
##  8     4 tiendas  
##  9     5 tiendas  
## 10     5 tyson's
# 2005: replace accented vowels with their plain counterparts
text_2005 <- text_2005 %>%
  mutate(word = chartr(
    old = str_c(names(replacement_list), collapse = ""),
    new = str_c(replacement_list, collapse = ""),
    x = word
  ))
dim(text_2005)
## [1] 3080    2
head(text_2005, n = 10)
## # A tibble: 10 × 2
##     line word           
##    <int> <chr>          
##  1     1 bienvenidos    
##  2     1 conferencia    
##  3     1 mundial        
##  4     1 desarrolladores
##  5     1 dia            
##  6     1 importante     
##  7     2 geniales       
##  8     2 ti             
##  9     2 quiero         
## 10     2 comenzar
# 2008: replace accented vowels with their plain counterparts
text_2008 <- text_2008 %>%
  mutate(word = chartr(
    old = str_c(names(replacement_list), collapse = ""),
    new = str_c(replacement_list, collapse = ""),
    x = word
  ))
dim(text_2008)
## [1] 5820    2
head(text_2008, n = 10)
## # A tibble: 10 × 2
##     line word      
##    <int> <chr>     
##  1     1 contento  
##  2     2 dias      
##  3     2 trabajando
##  4     2 duro      
##  5     2 geniales  
##  6     2 ansiosos  
##  7     2 compartir 
##  8     2 ustedes   
##  9     2 gracias   
## 10     2 venir
# 2010: replace accented vowels with their plain counterparts
text_2010 <- text_2010 %>%
  mutate(word = chartr(
    old = str_c(names(replacement_list), collapse = ""),
    new = str_c(replacement_list, collapse = ""),
    x = word
  ))
dim(text_2010)
## [1] 2421    2
head(text_2010, n = 10)
## # A tibble: 10 × 2
##     line word        
##    <int> <chr>       
##  1     1 volvamos    
##  2     1 iphone      
##  3     2 iphone      
##  4     2 reinvento   
##  5     2 consideramos
##  6     2 telefono    
##  7     3 dificil     
##  8     3 recordar    
##  9     3 operadores  
## 10     3 iphone

#Tokens mas frecuentes

##### top 10 de tokens mas frecuentes

# 1997: ten most frequent tokens
count(text_1997, word, sort = TRUE) %>%
  head(n = 10)
## # A tibble: 10 × 2
##    word          n
##    <chr>     <int>
##  1 apple       107
##  2 realmente    59
##  3 um           34
##  4 eh           32
##  5 personas     31
##  6 software     31
##  7 hardware     29
##  8 quiero       28
##  9 gente        26
## 10 mundo        23
# 2001: ten most frequent tokens
count(text_2001, word, sort = TRUE) %>%
  head(n = 10)
## # A tibble: 10 × 2
##    word             n
##    <chr>        <int>
##  1 mac             91
##  2 os              81
##  3 puedes          67
##  4 gracias         46
##  5 rapido          37
##  6 aplicaciones    34
##  7 realmente       30
##  8 apple           28
##  9 sistema         27
## 10 te              24
# 2005: ten most frequent tokens
count(text_2005, word, sort = TRUE) %>%
  head(n = 10)
## # A tibble: 10 × 2
##    word             n
##    <chr>        <int>
##  1 apple           43
##  2 intel           33
##  3 mac             29
##  4 aplicaciones    26
##  5 años            24
##  6 powerpc         24
##  7 xcode           24
##  8 os              23
##  9 transicion      22
## 10 procesadores    18
# 2008: ten most frequent tokens
count(text_2008, word, sort = TRUE) %>%
  head(n = 10)
## # A tibble: 10 × 2
##    word             n
##    <chr>        <int>
##  1 iphone         166
##  2 aplicacion      71
##  3 aplicaciones    55
##  4 realmente       45
##  5 telefono        36
##  6 correo          34
##  7 puedes          34
##  8 sdk             34
##  9 directamente    32
## 10 juego           32
# 2010: ten most frequent tokens
count(text_2010, word, sort = TRUE) %>%
  head(n = 10)
## # A tibble: 10 × 2
##    word             n
##    <chr>        <int>
##  1 iphone          70
##  2 pantalla        29
##  3 telefono        26
##  4 realmente       25
##  5 pixeles         24
##  6 puedes          23
##  7 video           21
##  8 camara          20
##  9 tu              18
## 10 aplicaciones    16
##### viz: most frequent words, 1997 vs 2001
suppressMessages(suppressWarnings(library(gridExtra)))

# Horizontal bar chart of the 20 most frequent 1997 tokens (slice_max keeps
# ties); reorder() sorts the bars by frequency.
p1 <- text_1997 %>%
  count(word, sort = TRUE) %>%
  slice_max(order_by = n, n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = word, y = n)) +
    geom_col(fill = "darkolivegreen4", alpha = 0.8) +
    coord_flip() +
    labs(x = NULL, y = "Frecuencia", title = "1997: Conteo de palabras") +
    theme_light()

# Same chart for the 2001 keynote.
p2 <- text_2001 %>%
  count(word, sort = TRUE) %>%
  slice_max(order_by = n, n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = word, y = n)) +
    geom_col(fill = "blue4", alpha = 0.8) +
    coord_flip() +
    labs(x = NULL, y = "Frecuencia", title = "2001: Conteo de palabras") +
    theme_light()

# Show both charts side by side
grid.arrange(p1, p2, ncol = 2)

##### viz: most frequent words, 1997 vs 2005
suppressMessages(suppressWarnings(library(gridExtra)))

# Horizontal bar chart of the 20 most frequent 1997 tokens (slice_max keeps
# ties); reorder() sorts the bars by frequency.
p1 <- text_1997 %>%
  count(word, sort = TRUE) %>%
  slice_max(order_by = n, n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = word, y = n)) +
    geom_col(fill = "darkolivegreen4", alpha = 0.8) +
    coord_flip() +
    labs(x = NULL, y = "Frecuencia", title = "1997: Conteo de palabras") +
    theme_light()

# Same chart for the 2005 keynote.
p2 <- text_2005 %>%
  count(word, sort = TRUE) %>%
  slice_max(order_by = n, n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = word, y = n)) +
    geom_col(fill = "blue4", alpha = 0.8) +
    coord_flip() +
    labs(x = NULL, y = "Frecuencia", title = "2005: Conteo de palabras") +
    theme_light()

# Show both charts side by side
grid.arrange(p1, p2, ncol = 2)

##### viz: most frequent words, 1997 vs 2008
suppressMessages(suppressWarnings(library(gridExtra)))

# Horizontal bar chart of the 20 most frequent 1997 tokens (slice_max keeps
# ties); reorder() sorts the bars by frequency.
p1 <- text_1997 %>%
  count(word, sort = TRUE) %>%
  slice_max(order_by = n, n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = word, y = n)) +
    geom_col(fill = "darkolivegreen4", alpha = 0.8) +
    coord_flip() +
    labs(x = NULL, y = "Frecuencia", title = "1997: Conteo de palabras") +
    theme_light()

# Same chart for the 2008 keynote.
p2 <- text_2008 %>%
  count(word, sort = TRUE) %>%
  slice_max(order_by = n, n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = word, y = n)) +
    geom_col(fill = "blue4", alpha = 0.8) +
    coord_flip() +
    labs(x = NULL, y = "Frecuencia", title = "2008: Conteo de palabras") +
    theme_light()

# Show both charts side by side
grid.arrange(p1, p2, ncol = 2)

##### viz: most frequent words, 1997 vs 2010
suppressMessages(suppressWarnings(library(gridExtra)))

# Horizontal bar chart of the 20 most frequent 1997 tokens (slice_max keeps
# ties); reorder() sorts the bars by frequency.
p1 <- text_1997 %>%
  count(word, sort = TRUE) %>%
  slice_max(order_by = n, n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = word, y = n)) +
    geom_col(fill = "darkolivegreen4", alpha = 0.8) +
    coord_flip() +
    labs(x = NULL, y = "Frecuencia", title = "1997: Conteo de palabras") +
    theme_light()

# Same chart for the 2010 keynote.
p2 <- text_2010 %>%
  count(word, sort = TRUE) %>%
  slice_max(order_by = n, n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = word, y = n)) +
    geom_col(fill = "blue4", alpha = 0.8) +
    coord_flip() +
    labs(x = NULL, y = "Frecuencia", title = "2010: Conteo de palabras") +
    theme_light()

# Show both charts side by side
grid.arrange(p1, p2, ncol = 2)

suppressMessages(suppressWarnings(library(wordcloud)))
###### viz: word clouds, 1997 vs 2001
# two panels in one row with tight margins
par(mfrow = c(1, 2), mar = c(1, 1, 2, 1), mgp = c(1, 1, 1))

# The seed is reset before each cloud, presumably so the layout is reproducible.
set.seed(123)
with(
  count(text_1997, word, sort = TRUE),
  wordcloud(words = word, freq = n, max.words = 20, colors = "darkolivegreen4")
)
title(main = "1997")

set.seed(123)
with(
  count(text_2001, word, sort = TRUE),
  wordcloud(words = word, freq = n, max.words = 20, colors = "blue4")
)
title(main = "2001")

suppressMessages(suppressWarnings(library(wordcloud)))
###### viz: word clouds, 1997 vs 2005
# two panels in one row with tight margins
par(mfrow = c(1, 2), mar = c(1, 1, 2, 1), mgp = c(1, 1, 1))

# The seed is reset before each cloud, presumably so the layout is reproducible.
set.seed(124)
with(
  count(text_1997, word, sort = TRUE),
  wordcloud(words = word, freq = n, max.words = 20, colors = "darkolivegreen4")
)
title(main = "1997")

set.seed(124)
with(
  count(text_2005, word, sort = TRUE),
  wordcloud(words = word, freq = n, max.words = 20, colors = "blue4")
)
## Warning in wordcloud(words = word, freq = n, max.words = 20, colors = "blue4"):
## apple could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word, freq = n, max.words = 20, colors = "blue4"):
## transicion could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word, freq = n, max.words = 20, colors = "blue4"):
## aplicaciones could not be fit on page. It will not be plotted.
title(main = "2005")

suppressMessages(suppressWarnings(library(wordcloud)))
###### viz: word clouds, 1997 vs 2008
# two panels in one row with tight margins
par(mfrow = c(1, 2), mar = c(1, 1, 2, 1), mgp = c(1, 1, 1))

# The seed is reset before each cloud, presumably so the layout is reproducible.
set.seed(123)
with(
  count(text_1997, word, sort = TRUE),
  wordcloud(words = word, freq = n, max.words = 20, colors = "darkolivegreen4")
)
title(main = "1997")

set.seed(123)
with(
  count(text_2008, word, sort = TRUE),
  wordcloud(words = word, freq = n, max.words = 20, colors = "blue4")
)
title(main = "2008")

suppressMessages(suppressWarnings(library(wordcloud)))
###### viz: word clouds, 1997 vs 2010
# two panels in one row with tight margins
par(mfrow = c(1, 2), mar = c(1, 1, 2, 1), mgp = c(1, 1, 1))

# The seed is reset before each cloud, presumably so the layout is reproducible.
set.seed(123)
with(
  count(text_1997, word, sort = TRUE),
  wordcloud(words = word, freq = n, max.words = 20, colors = "darkolivegreen4")
)
title(main = "1997")

set.seed(123)
with(
  count(text_2010, word, sort = TRUE),
  wordcloud(words = word, freq = n, max.words = 20, colors = "blue4")
)
title(main = "2010")

##### relative word frequencies, 1997 vs 2001
# Stack both keynotes, count each word per keynote, turn the counts into shares
# of that keynote's tokens and spread to one column per keynote. fill = 0 marks
# words that are absent from a keynote (important!).
frec <- bind_rows("1997" = text_1997, "2001" = text_2001, .id = "author") %>%
  count(author, word) %>%
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>%
  select(-n) %>%
  spread(key = author, value = proportion, fill = 0) %>%
  select(word, "1997", "2001")
dim(frec)
## [1] 3479    3
head(frec, n = 10)
## # A tibble: 10 × 3
##    word          `1997`   `2001`
##    <chr>          <dbl>    <dbl>
##  1 abajo       0.000499 0.000336
##  2 abarca      0.000249 0       
##  3 abdominales 0.000249 0       
##  4 abiertas    0.000249 0.000168
##  5 abierto     0.000998 0.000168
##  6 abiertos    0        0.000168
##  7 abogando    0.000249 0       
##  8 aborda      0.000249 0       
##  9 abordar     0.000249 0       
## 10 abramos     0        0.000168
##### top 10 words in common
# Keep only the words used in BOTH keynotes, ordered by their 1997 share and
# then by their 2001 share (both descending).
# Column names that start with a digit must be back-quoted: a bare 1997 is the
# numeric literal, so `1997 != 0` is always TRUE and desc(1997) sorts nothing.
# NOTE: recorded output further down that still shows every row of `frec` was
# produced before this fix; re-run to refresh it.
frec_comun <- frec %>%
  filter(`1997` != 0, `2001` != 0) %>%
  arrange(desc(`1997`), desc(`2001`))
dim(frec_comun)
## [1] 3479    3
head(frec_comun, n = 10)
## # A tibble: 10 × 3
##    word          `1997`   `2001`
##    <chr>          <dbl>    <dbl>
##  1 abajo       0.000499 0.000336
##  2 abarca      0.000249 0       
##  3 abdominales 0.000249 0       
##  4 abiertas    0.000249 0.000168
##  5 abierto     0.000998 0.000168
##  6 abiertos    0        0.000168
##  7 abogando    0.000249 0       
##  8 aborda      0.000249 0       
##  9 abordar     0.000249 0       
## 10 abramos     0        0.000168
###### proporcion palabras en comun
dim(frec_comun)[1]/dim(frec)[1]
## [1] 1
##### relative word frequencies, 1997 vs 2005
# Stack both keynotes, count each word per keynote, turn the counts into shares
# of that keynote's tokens and spread to one column per keynote. fill = 0 marks
# words that are absent from a keynote (important!).
frec <- bind_rows("1997" = text_1997, "2005" = text_2005, .id = "author") %>%
  count(author, word) %>%
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>%
  select(-n) %>%
  spread(key = author, value = proportion, fill = 0) %>%
  select(word, "1997", "2005")
dim(frec)
## [1] 2705    3
head(frec, n = 10)
## # A tibble: 10 × 3
##    word          `1997`   `2005`
##    <chr>          <dbl>    <dbl>
##  1 abajo       0.000499 0       
##  2 abarca      0.000249 0       
##  3 abarrotada  0        0.000325
##  4 abdominales 0.000249 0       
##  5 abiertas    0.000249 0       
##  6 abierto     0.000998 0.000974
##  7 abiertos    0        0.000649
##  8 abogando    0.000249 0       
##  9 aborda      0.000249 0       
## 10 abordar     0.000249 0
##### top 10 words in common
# Keep only the words used in BOTH keynotes, ordered by their 1997 share and
# then by their 2005 share (both descending).
# Column names that start with a digit must be back-quoted: a bare 1997 is the
# numeric literal, so `1997 != 0` is always TRUE and desc(1997) sorts nothing.
# NOTE: recorded output further down that still shows every row of `frec` was
# produced before this fix; re-run to refresh it.
frec_comun1 <- frec %>%
  filter(`1997` != 0, `2005` != 0) %>%
  arrange(desc(`1997`), desc(`2005`))
dim(frec_comun1)
## [1] 2705    3
head(frec_comun1, n = 10)
## # A tibble: 10 × 3
##    word          `1997`   `2005`
##    <chr>          <dbl>    <dbl>
##  1 abajo       0.000499 0       
##  2 abarca      0.000249 0       
##  3 abarrotada  0        0.000325
##  4 abdominales 0.000249 0       
##  5 abiertas    0.000249 0       
##  6 abierto     0.000998 0.000974
##  7 abiertos    0        0.000649
##  8 abogando    0.000249 0       
##  9 aborda      0.000249 0       
## 10 abordar     0.000249 0
###### proporcion palabras en comun
dim(frec_comun1)[1]/dim(frec)[1]
## [1] 1
##### relative word frequencies, 1997 vs 2008
# Stack both keynotes, count each word per keynote, turn the counts into shares
# of that keynote's tokens and spread to one column per keynote. fill = 0 marks
# words that are absent from a keynote (important!).
frec2 <- bind_rows("1997" = text_1997, "2008" = text_2008, .id = "author") %>%
  count(author, word) %>%
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>%
  select(-n) %>%
  spread(key = author, value = proportion, fill = 0) %>%
  select(word, "1997", "2008")
dim(frec2)
## [1] 3395    3
head(frec2, n = 10)
## # A tibble: 10 × 3
##    word          `1997`   `2008`
##    <chr>          <dbl>    <dbl>
##  1 aaron       0        0.000172
##  2 abajo       0.000499 0.000687
##  3 abandona    0        0.000172
##  4 abandonado  0        0.000172
##  5 abarca      0.000249 0       
##  6 abdominales 0.000249 0       
##  7 abiertas    0.000249 0       
##  8 abierto     0.000998 0.000172
##  9 abiertos    0        0.000172
## 10 abogados    0        0.000172
##### top 10 words in common
# Keep only the words used in BOTH keynotes, ordered by their 1997 share and
# then by their 2008 share (both descending).
# Column names that start with a digit must be back-quoted: a bare 1997 is the
# numeric literal, so `1997 != 0` is always TRUE and desc(1997) sorts nothing.
# NOTE: recorded output further down that still shows every row of `frec2` was
# produced before this fix; re-run to refresh it.
frec_comun2 <- frec2 %>%
  filter(`1997` != 0, `2008` != 0) %>%
  arrange(desc(`1997`), desc(`2008`))
dim(frec_comun2)
## [1] 3395    3
head(frec_comun2, n = 10)
## # A tibble: 10 × 3
##    word          `1997`   `2008`
##    <chr>          <dbl>    <dbl>
##  1 aaron       0        0.000172
##  2 abajo       0.000499 0.000687
##  3 abandona    0        0.000172
##  4 abandonado  0        0.000172
##  5 abarca      0.000249 0       
##  6 abdominales 0.000249 0       
##  7 abiertas    0.000249 0       
##  8 abierto     0.000998 0.000172
##  9 abiertos    0        0.000172
## 10 abogados    0        0.000172
###### proporcion palabras en comun
dim(frec_comun2)[1]/dim(frec2)[1]
## [1] 1
##### relative word frequencies, 1997 vs 2010
# Stack both keynotes, count each word per keynote, turn the counts into shares
# of that keynote's tokens and spread to one column per keynote. fill = 0 marks
# words that are absent from a keynote (important!).
frec3 <- bind_rows("1997" = text_1997, "2010" = text_2010, .id = "author") %>%
  count(author, word) %>%
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>%
  select(-n) %>%
  spread(key = author, value = proportion, fill = 0) %>%
  select(word, "1997", "2010")
dim(frec3)
## [1] 2542    3
head(frec3, n = 10)
## # A tibble: 10 × 3
##    word          `1997`   `2010`
##    <chr>          <dbl>    <dbl>
##  1 aac         0        0.000413
##  2 abajo       0.000499 0.00165 
##  3 abarca      0.000249 0       
##  4 abdominales 0.000249 0       
##  5 abiertas    0.000249 0.000413
##  6 abierto     0.000998 0.000413
##  7 abiertos    0        0.000413
##  8 abogando    0.000249 0       
##  9 aborda      0.000249 0       
## 10 abordar     0.000249 0
##### top 10 words in common
# Keep only the words used in BOTH keynotes, ordered by their 1997 share and
# then by their 2010 share (both descending).
# Column names that start with a digit must be back-quoted: a bare 1997 is the
# numeric literal, so `1997 != 0` is always TRUE and desc(1997) sorts nothing.
# NOTE: recorded output further down that still shows every row of `frec3` was
# produced before this fix; re-run to refresh it.
frec_comun3 <- frec3 %>%
  filter(`1997` != 0, `2010` != 0) %>%
  arrange(desc(`1997`), desc(`2010`))
dim(frec_comun3)
## [1] 2542    3
head(frec_comun3, n = 10)
## # A tibble: 10 × 3
##    word          `1997`   `2010`
##    <chr>          <dbl>    <dbl>
##  1 aac         0        0.000413
##  2 abajo       0.000499 0.00165 
##  3 abarca      0.000249 0       
##  4 abdominales 0.000249 0       
##  5 abiertas    0.000249 0.000413
##  6 abierto     0.000998 0.000413
##  7 abiertos    0        0.000413
##  8 abogando    0.000249 0       
##  9 aborda      0.000249 0       
## 10 abordar     0.000249 0
###### proporcion palabras en comun
dim(frec_comun3)[1]/dim(frec3)[1]
## [1] 1
##### relative frequencies for all five keynotes, one column per keynote
# The argument names given to bind_rows() become the `author` labels and, after
# spread(), the names of the frequency columns.
frec <- bind_rows(
  freq_1997 = text_1997,
  freq_2001 = text_2001,
  freq_2005 = text_2005,
  freq_2008 = text_2008,
  freq_2010 = text_2010,
  .id = "author"
) %>%
  count(author, word) %>%
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>%  # share of that keynote's tokens
  select(-n) %>%
  spread(key = author, value = proportion, fill = 0)  # 0 where a word is absent

# Mostrar las dimensiones para verificar
dim(frec)
## [1] 5644    6
# Verificar las primeras filas
head(frec)
## # A tibble: 6 × 6
##   word       freq_1997 freq_2001 freq_2005 freq_2008 freq_2010
##   <chr>          <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
## 1 aac         0         0                0  0         0.000413
## 2 aaron       0         0                0  0.000172  0       
## 3 abajo       0.000499  0.000336         0  0.000687  0.00165 
## 4 abandona    0         0                0  0.000172  0       
## 5 abandonado  0         0                0  0.000172  0       
## 6 abarca      0.000249  0                0  0         0
# Correlation test between the 1997 and 2001 relative word frequencies
# (all words of the five-keynote table, zeros included)
cor.test(x = frec$freq_1997, y = frec$freq_2001)
## 
##  Pearson's product-moment correlation
## 
## data:  frec$freq_1997 and frec$freq_2001
## t = 29.867, df = 5642, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3467456 0.3918056
## sample estimates:
##       cor 
## 0.3694928
# Correlation test between the 1997 and 2005 relative word frequencies
cor.test(x = frec$freq_1997, y = frec$freq_2005)
## 
##  Pearson's product-moment correlation
## 
## data:  frec$freq_1997 and frec$freq_2005
## t = 41.369, df = 5642, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4621531 0.5021951
## sample estimates:
##       cor 
## 0.4824261
# Correlation test between the 1997 and 2008 relative word frequencies
cor.test(x = frec$freq_1997, y = frec$freq_2008)
## 
##  Pearson's product-moment correlation
## 
## data:  frec$freq_1997 and frec$freq_2008
## t = 20.806, df = 5642, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2425473 0.2910109
## sample estimates:
##       cor 
## 0.2669479
# Correlation test between the 1997 and 2010 relative word frequencies
cor.test(x = frec$freq_1997, y = frec$freq_2010)
## 
##  Pearson's product-moment correlation
## 
## data:  frec$freq_1997 and frec$freq_2010
## t = 16.712, df = 5642, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1921831 0.2419032
## sample estimates:
##      cor 
## 0.217184
# Check the column names
names(frec_comun)
## [1] "word" "1997" "2001"
# Rename to syntactic names. frec_comun has exactly three columns (word, 1997,
# 2001), so exactly three names are supplied; the previous six-name vector did
# not match the table and is an error (or a silent truncation, depending on the
# tibble version -- not verified here).
colnames(frec_comun) <- c("word", "freq_1997", "freq_2001")

# Keep the rows where neither frequency is NA
frec_comun <- frec_comun %>%
  filter(!is.na(freq_1997) & !is.na(freq_2001))

# Correlation test between the 1997 and 2001 frequencies
cor.test(x = frec_comun$freq_1997, y = frec_comun$freq_2001)
## 
##  Pearson's product-moment correlation
## 
## data:  frec_comun$freq_1997 and frec_comun$freq_2001
## t = 20.589, df = 3477, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2997067 0.3589539
## sample estimates:
##       cor 
## 0.3296548

#7 Sentiment analysis

# Lexicons
# tidytext has no Spanish sentiment lexicons available, so the positive and
# negative word lists are read from local files; see
# https://www.kaggle.com/datasets/rtatman/sentiment-lexicons-for-81-languages
positive_words <- mutate(
  read_csv("positive_words_es.txt", col_names = "word", show_col_types = FALSE),
  sentiment = "Positivo"
)
negative_words <- mutate(
  read_csv("negative_words_es.txt", col_names = "word", show_col_types = FALSE),
  sentiment = "Negativo"
)
# Stack both lists into a single word -> sentiment lookup table
sentiment_words <- bind_rows(positive_words, negative_words)
# Lexicon comparison: number of entries per sentiment in the "bing" lexicon
count(get_sentiments("bing"), sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005
# Number of entries per sentiment in the Spanish lexicon built above
count(sentiment_words, sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 Negativo   2720
## 2 Positivo   1555
###### Visualisation: most frequent sentiment words per conference
suppressMessages(suppressWarnings(library(RColorBrewer)))

# Build a diverging bar chart of the most frequent sentiment-bearing words.
#
# tokens:    tokenised text; must contain a `word` column.
# year:      label placed at the start of the plot title.
# max_words: number of top words requested from slice_max() (ties are kept,
#            so slightly more rows may be shown).
# limit:     symmetric bound of the frequency axis passed to coord_flip().
#
# Returns a ggplot object. Counts of "Negativo" words are negated so their
# bars extend to the opposite side of zero from the "Positivo" ones.
# The fill colours are named so that each sentiment keeps its colour even
# when a text contains only one of the two classes.
plot_sentiment_counts <- function(tokens, year, max_words = 20, limit = 7) {
  pal <- brewer.pal(8, "Dark2")[c(2, 5)]
  tokens %>%
    inner_join(sentiment_words, by = "word") %>%
    count(word, sentiment, sort = TRUE) %>%
    slice_max(order_by = n, n = max_words) %>%
    mutate(
      n = if_else(sentiment == "Negativo", -n, n),
      word = reorder(word, n)
    ) %>%
    ggplot(aes(word, n, fill = sentiment)) +
    geom_col() +
    scale_fill_manual(values = c(Negativo = pal[1], Positivo = pal[2])) +
    coord_flip(ylim = c(-limit, limit)) +
    labs(
      y = "Frecuencia",
      x = NULL,
      title = paste0(year, ": Conteo por sentimiento")
    ) +
    theme_minimal()
}

# 1997 is the reference plot; it is shown side by side with each later year.
p1 <- plot_sentiment_counts(text_1997, 1997)

p2 <- plot_sentiment_counts(text_2001, 2001)
grid.arrange(p1, p2, ncol = 2)

p2 <- plot_sentiment_counts(text_2005, 2005)
grid.arrange(p1, p2, ncol = 2)

p2 <- plot_sentiment_counts(text_2008, 2008)
grid.arrange(p1, p2, ncol = 2)

p2 <- plot_sentiment_counts(text_2010, 2010)
grid.arrange(p1, p2, ncol = 2)

suppressMessages(suppressWarnings(library(reshape2)))  # acast

##### Visualisation: positive vs. negative comparison clouds

# Draw one comparison cloud (negative vs. positive words) for a tokenised
# text and put `year` above it as the panel title.
#
# tokens:    tokenised text; must contain a `word` column.
# year:      character label used as the panel title.
# max_words: maximum number of words passed to comparison.cloud().
# scale:     largest/smallest word size passed to comparison.cloud(). The
#            default here is smaller than the function's own default of
#            c(4, 0.5), with which many words could not be fit into the
#            half-width panels; tune it further if warnings persist.
#
# The seed is reset before every cloud so the layout is reproducible.
plot_sentiment_cloud <- function(tokens, year, max_words = 50,
                                 scale = c(3, 0.4)) {
  set.seed(123)
  tokens %>%
    inner_join(sentiment_words, by = "word") %>%
    count(word, sentiment, sort = TRUE) %>%
    acast(word ~ sentiment, value.var = "n", fill = 0) %>%
    comparison.cloud(
      colors = brewer.pal(8, "Dark2")[c(2, 5)],
      scale = scale,
      max.words = max_words,
      title.size = 1.5
    )
  title(main = year)
}

# Draw the clouds of two texts side by side on one device page.
# The graphical parameters are restored on exit so later plots are not
# affected by the two-panel layout.
compare_sentiment_clouds <- function(tokens_a, year_a, tokens_b, year_b) {
  old_par <- par(mfrow = c(1, 2), mar = c(1, 1, 2, 1), mgp = c(1, 1, 1))
  on.exit(par(old_par), add = TRUE)
  plot_sentiment_cloud(tokens_a, year_a)
  plot_sentiment_cloud(tokens_b, year_b)
  invisible(NULL)
}

# 1997 is the reference; compare it with each later year in turn.
compare_sentiment_clouds(text_1997, "1997", text_2001, "2001")
compare_sentiment_clouds(text_1997, "1997", text_2005, "2005")
compare_sentiment_clouds(text_1997, "1997", text_2008, "2008")
compare_sentiment_clouds(text_1997, "1997", text_2010, "2010")

#8 Bigrams

# Read one transcript file and return a tibble with two columns:
# `line` (running index of the text fragment) and `text` (the fragment).
# NOTE(review): read_csv() reports parsing issues on these prose files (see
# the warnings below); readr::read_lines() may be a better fit -- confirm
# against the import section at the top of the file before switching, since
# it would change the downstream counts.
read_transcript <- function(path) {
  fragments <- unname(unlist(c(
    read_csv(path, col_names = FALSE, show_col_types = FALSE)
  )))
  tibble(line = seq_along(fragments), text = fragments)
}

# Split the `text` column of a transcript into bigrams, one per row.
# Rows that come back with an NA bigram are dropped (important!).
to_bigrams <- function(transcript) {
  transcript %>%
    unnest_tokens(output = bigram, input = text, token = "ngrams", n = 2) %>%
    filter(!is.na(bigram))
}

text_1997 <- read_transcript("AppleWWDC1997_es.txt")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
text_2001 <- read_transcript("AppleWWDC2001_es.txt")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
text_2005 <- read_transcript("AppleWWDC2005_es.txt")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
text_2008 <- read_transcript("AppleWWDC2008_es.txt")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
text_2010 <- read_transcript("AppleWWDC2010_es.txt")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
text_1997_bi <- to_bigrams(text_1997)
dim(text_1997_bi)
## [1] 10362     2
text_2001_bi <- to_bigrams(text_2001)
dim(text_2001_bi)
## [1] 13537     2
text_2005_bi <- to_bigrams(text_2005)
dim(text_2005_bi)
## [1] 7594    2
text_2008_bi <- to_bigrams(text_2008)
dim(text_2008_bi)
## [1] 13615     2
text_2010_bi <- to_bigrams(text_2010)
dim(text_2010_bi)
## [1] 5955    2
# Preview the first ten bigram rows of the 1997 text
text_1997_bi %>% head(n = 10)
## # A tibble: 10 × 2
##     line bigram          
##    <int> <chr>           
##  1     1 buenos días     
##  2     2 ambos llevaban  
##  3     2 llevaban corbata
##  4     2 corbata toda    
##  5     2 toda la         
##  6     2 la semana       
##  7     5 corto así       
##  8     5 así que         
##  9     5 que lo          
## 10     5 lo haré
# Preview the first ten bigram rows of the 2005 text
text_2005_bi %>% head(n = 10)
## # A tibble: 10 × 2
##     line bigram              
##    <int> <chr>               
##  1     1 bienvenidos a       
##  2     1 a nuestra           
##  3     1 nuestra conferencia 
##  4     1 conferencia mundial 
##  5     1 mundial de          
##  6     1 de desarrolladores  
##  7     1 desarrolladores 2005
##  8     1 2005 hoy            
##  9     1 hoy es              
## 10     1 es un
# Preview the first ten bigram rows of the 2008 text
text_2008_bi %>% head(n = 10)
## # A tibble: 10 × 2
##     line bigram      
##    <int> <chr>       
##  1     1 estoy muy   
##  2     1 muy contento
##  3     1 contento de 
##  4     1 de estar    
##  5     1 estar aquí  
##  6     1 aquí esta   
##  7     1 esta vez    
##  8     2 buenos días 
##  9     2 días hemos  
## 10     2 hemos estado
# Preview the first ten bigram rows of the 2010 text
text_2010_bi %>% head(n = 10)
## # A tibble: 10 × 2
##     line bigram          
##    <int> <chr>           
##  1     1 así que         
##  2     1 que volvamos    
##  3     1 volvamos al     
##  4     1 al iphone       
##  5     2 en 2007         
##  6     2 2007 el         
##  7     2 el iphone       
##  8     2 iphone reinventó
##  9     2 reinventó lo    
## 10     2 lo que
###### Ten most frequent bigrams (1997)
# Some of the top bigrams are not interesting (e.g., "de la"),
# which motivates applying stop words again.
head(count(text_1997_bi, bigram, sort = TRUE), n = 10)
## # A tibble: 10 × 2
##    bigram        n
##    <chr>     <int>
##  1 creo que     94
##  2 lo que       70
##  3 así que      35
##  4 que apple    34
##  5 y creo       33
##  6 en el        32
##  7 que no       32
##  8 de las       30
##  9 ya sabes     30
## 10 en la        28
###### Ten most frequent bigrams (2001)
# Some of the top bigrams are not interesting (e.g., "de la"),
# which motivates applying stop words again.
head(count(text_2001_bi, bigram, sort = TRUE), n = 10)
## # A tibble: 10 × 2
##    bigram       n
##    <chr>    <int>
##  1 lo que      85
##  2 así que     81
##  3 os 10       69
##  4 en el       66
##  5 de la       53
##  6 mac os      52
##  7 en la       39
##  8 es un       37
##  9 para que    31
## 10 es una      27
###### Top 10 most frequent bigrams (2005)
# Some bigrams are not interesting (e.g., "de la"),
# which motivates using stop words again.
head(count(text_2005_bi, bigram, sort = TRUE), n = 10)
## # A tibble: 10 × 2
##    bigram       n
##    <chr>    <int>
##  1 lo que      44
##  2 así que     41
##  3 en el       33
##  4 vamos a     23
##  5 ya sabes    21
##  6 más de      20
##  7 os 10       20
##  8 que es      19
##  9 de apple    18
## 10 de la       18
###### Top 10 most frequent bigrams (2008)
# Some bigrams are not interesting (e.g., "de la"),
# which motivates using stop words again.
head(count(text_2008_bi, bigram, sort = TRUE), n = 10)
## # A tibble: 10 × 2
##    bigram            n
##    <chr>         <int>
##  1 en el            89
##  2 así que          76
##  3 el iphone        71
##  4 lo que           69
##  5 en la            61
##  6 de la            43
##  7 para que         39
##  8 con el           29
##  9 voy a            28
## 10 la aplicación    26
###### Top 10 most frequent bigrams (2010)
# Some bigrams are not interesting (e.g., "de la"),
# which motivates using stop words again.
head(count(text_2010_bi, bigram, sort = TRUE), n = 10)
## # A tibble: 10 × 2
##    bigram         n
##    <chr>      <int>
##  1 lo que        40
##  2 iphone 4      38
##  3 el iphone     36
##  4 así que       33
##  5 en el         22
##  6 en la         21
##  7 de la         19
##  8 adelante y    17
##  9 para que      17
## 10 es el         15
# Split each two-word token into its words, discard pairs that contain a
# digit or a word listed in `stop_list`, apply the character substitutions
# of `replacements` via chartr(), drop missing words and count how often
# each (word1, word2) pair occurs.
#
# tokens:       tibble with one two-word token per row.
# col:          unquoted name of the column of `tokens` holding the token.
# stop_list:    character vector of words to discard
#               (defaults to stop_words_es$word, defined earlier in the file).
# replacements: named object whose names are the characters to replace and
#               whose values are the substitutes (defaults to
#               replacement_list, defined earlier in the file).
#
# Returns a tibble with columns word1, word2 and weight, sorted by
# decreasing weight. The column is called `weight` on purpose: the network
# code further down reads it through E(g)$weight.
count_word_pairs <- function(tokens, col,
                             stop_list = stop_words_es$word,
                             replacements = replacement_list) {
  # Build the chartr() translation strings once instead of once per column.
  old_chars <- names(replacements) %>% str_c(collapse = "")
  new_chars <- replacements %>% str_c(collapse = "")

  tokens %>%
    separate({{ col }}, c("word1", "word2"), sep = " ") %>%
    filter(!grepl(pattern = "[0-9]", x = word1)) %>%
    filter(!grepl(pattern = "[0-9]", x = word2)) %>%
    filter(!word1 %in% stop_list) %>%
    filter(!word2 %in% stop_list) %>%
    mutate(word1 = chartr(old = old_chars, new = new_chars, x = word1)) %>%
    mutate(word2 = chartr(old = old_chars, new = new_chars, x = word2)) %>%
    filter(!is.na(word1)) %>%
    filter(!is.na(word2)) %>%
    count(word1, word2, sort = TRUE) %>%
    rename(weight = n)
}

text_1997_bi_counts <- count_word_pairs(text_1997_bi, bigram)
dim(text_1997_bi_counts)
## [1] 743   3
text_2001_bi_counts <- count_word_pairs(text_2001_bi, bigram)
dim(text_2001_bi_counts)
## [1] 1171    3
text_2005_bi_counts <- count_word_pairs(text_2005_bi, bigram)
dim(text_2005_bi_counts)
## [1] 588   3
text_2008_bi_counts <- count_word_pairs(text_2008_bi, bigram)
dim(text_2008_bi_counts)
## [1] 1188    3
text_2010_bi_counts <- count_word_pairs(text_2010_bi, bigram)
dim(text_2010_bi_counts)
## [1] 508   3
# Ten heaviest word pairs for 1997.
text_1997_bi_counts %>% head(n = 10)
## # A tibble: 10 × 3
##    word1     word2       weight
##    <chr>     <chr>        <int>
##  1 rap       city             8
##  2 correo    electronico      7
##  3 street    journal          6
##  4 wall      street           6
##  5 productos realmente        5
##  6 apple     deberia          4
##  7 disco     duro             4
##  8 pequeña   cosa             4
##  9 and       play             3
## 10 apple     necesita         3
# Ten heaviest word pairs for 2001.
text_2001_bi_counts %>% head(n = 10)
## # A tibble: 10 × 3
##    word1   word2     weight
##    <chr>   <chr>      <int>
##  1 mac     os            52
##  2 sistema operativo      9
##  3 super   drive          9
##  4 power   mac            8
##  5 gracias steve          7
##  6 disco   duro           6
##  7 power   max            6
##  8 centro  comercial      5
##  9 grabar  dvd            5
## 10 libro   mundial        5
# Ten heaviest word pairs for 2005.
text_2005_bi_counts %>% head(n = 10)
## # A tibble: 10 × 3
##    word1        word2       weight
##    <chr>        <chr>        <int>
##  1 sistema      operativo       12
##  2 mac          os              10
##  3 binarios     universales      7
##  4 has          visto            7
##  5 codigo       fuente           6
##  6 procesadores intel            6
##  7 dejame       abrir            5
##  8 excelentes   productos        5
##  9 dejame       mostrarte        4
## 10 wolfram      research         4
# Ten heaviest word pairs for 2008.
text_2008_bi_counts %>% head(n = 10)
## # A tibble: 10 × 3
##    word1       word2        weight
##    <chr>       <chr>         <int>
##  1 correo      electronico      25
##  2 gustaria    invitar          12
##  3 app         store            11
##  4 mis         contactos         8
##  5 sitio       web               6
##  6 correos     electronicos      5
##  7 dispositivo movil             5
##  8 interfaz    web               5
##  9 mac         os                5
## 10 tu          iphone            5
# Ten heaviest word pairs for 2010.
text_2010_bi_counts %>% head(n = 10)
## # A tibble: 10 × 3
##    word1        word2      weight
##    <chr>        <chr>       <int>
##  1 pantalla     retina         14
##  2 acero        inoxidable      7
##  3 wi           fi              7
##  4 estaciones   base            5
##  5 tu           telefono        5
##  6 alta         definicion      4
##  7 flash        led             4
##  8 computadoras portatiles      3
##  9 has          visto           3
## 10 iphone       original        3
##### Define a network from the bigram frequencies (weight)
# binary, undirected, weighted, simple
# It is advisable to vary the filter threshold and to build non-consecutive
# bigrams in order to obtain more informative networks.
suppressMessages(suppressWarnings(library(igraph)))

# Keep the word pairs whose weight is strictly greater than `threshold`,
# build an undirected graph from them and plot it with vertex labels.
#
# counts:    tibble with columns word1, word2 and weight.
# year:      label used in the plot title.
# threshold: pairs with weight <= threshold are dropped.
#
# The title is derived from `year` and `threshold`, so it always matches the
# filter that was actually applied. Returns the graph invisibly.
plot_bigram_network <- function(counts, year, threshold) {
  # Build the network
  g <- counts %>%
    filter(weight > threshold) %>%
    graph_from_data_frame(directed = FALSE)

  # Visual tuning; the seed is fixed before every plot, as in the original.
  set.seed(123)
  plot(g,
       layout = layout_with_kk,  # layout with more spread
       vertex.color = 1,
       vertex.frame.color = 1,
       vertex.size = 6,  # larger vertices for better separation
       vertex.label.color = 'black',
       vertex.label.cex = 0.6,  # slightly smaller font
       vertex.label.dist = 3,  # more distance between labels and vertices
       main = paste0(year, " - Umbral = ", threshold))
  invisible(g)
}

# `g` is kept at top level, holding the most recently plotted network.
g <- plot_bigram_network(text_1997_bi_counts, year = 1997, threshold = 2)
g <- plot_bigram_network(text_2001_bi_counts, year = 2001, threshold = 3)
g <- plot_bigram_network(text_2005_bi_counts, year = 2005, threshold = 4)
g <- plot_bigram_network(text_2008_bi_counts, year = 2008, threshold = 3)
g <- plot_bigram_network(text_2010_bi_counts, year = 2010, threshold = 2)

##### Network with a different threshold
# Keep the word pairs whose weight is strictly greater than `threshold`,
# build an undirected graph from them and plot it without vertex labels.
#
# counts:    tibble with columns word1, word2 and weight.
# year:      label used in the plot title.
# threshold: pairs with weight <= threshold are dropped (default 1).
#
# Returns the graph invisibly.
plot_unlabelled_network <- function(counts, year, threshold = 1) {
  g <- counts %>%
    filter(weight > threshold) %>%
    graph_from_data_frame(directed = FALSE)
  # viz; the seed is fixed before every plot, as in the original.
  set.seed(123)
  plot(g,
       layout = layout_with_kk,
       vertex.color = 1,
       vertex.frame.color = 1,
       vertex.size = 3,
       vertex.label = NA,
       main = paste0(year, " - Umbral = ", threshold))
  invisible(g)
}

# `g` is kept at top level, holding the most recently plotted network.
g <- plot_unlabelled_network(text_1997_bi_counts, year = 1997)
g <- plot_unlabelled_network(text_2001_bi_counts, year = 2001)
g <- plot_unlabelled_network(text_2005_bi_counts, year = 2005)
g <- plot_unlabelled_network(text_2008_bi_counts, year = 2008)
g <- plot_unlabelled_network(text_2010_bi_counts, year = 2010)

##### Largest connected component of the network

# Build the undirected bigram graph from the pairs whose weight is strictly
# greater than `threshold` and store each vertex's component id in the
# vertex attribute `cluster`.
# Uses components(), the replacement for the deprecated clusters().
clustered_bigram_graph <- function(counts, threshold) {
  g <- counts %>%
    filter(weight > threshold) %>%
    graph_from_data_frame(directed = FALSE)
  V(g)$cluster <- components(g)$membership
  g
}

# Subgraph induced by the vertices of the largest connected component of `g`
# (the first one, as picked by which.max(), when sizes are tied).
largest_component <- function(g) {
  comp <- components(g)
  induced_subgraph(
    graph = g,
    vids = which(comp$membership == which.max(comp$csize))
  )
}

# Plot a component: vertex size grows with vertex strength and edge width
# with edge weight.
# The edge widths are computed from the edges of `gcc` itself. The previous
# code used the edges of the full graph, which yields a vector of the wrong
# length whose entries do not line up with the edges being drawn.
# NOTE(review): par() is left in effect after the call, as in the original
# script; mfrow = c(1, 2) reserves two panels although only one plot is
# drawn here — confirm whether that is intended.
plot_component <- function(gcc) {
  par(mfrow = c(1, 2), mar = c(1, 1, 2, 1), mgp = c(1, 1, 1))
  set.seed(123)
  edge_weights <- E(gcc)$weight
  plot(gcc,
       layout = layout_with_kk,
       vertex.color = adjustcolor('darkolivegreen4', 0.1),
       vertex.frame.color = 'darkolivegreen4',
       vertex.size = 2 * strength(gcc),
       vertex.label.color = 'black',
       vertex.label.cex = 0.9,
       vertex.label.dist = 1,
       edge.width = 3 * edge_weights / max(edge_weights))
  title(main = "Componente conexa", outer = TRUE, line = -1)
  invisible(gcc)
}

# 1997
g <- clustered_bigram_graph(text_1997_bi_counts, threshold = 1)
gcc <- largest_component(g)
plot_component(gcc)

# 2001
g <- clustered_bigram_graph(text_2001_bi_counts, threshold = 1)
gcc <- largest_component(g)
plot_component(gcc)

# 2005
g <- clustered_bigram_graph(text_2005_bi_counts, threshold = 1)
gcc <- largest_component(g)
plot_component(gcc)

# 2008 (this year uses a higher threshold than the others)
g <- clustered_bigram_graph(text_2008_bi_counts, threshold = 2)
gcc <- largest_component(g)
plot_component(gcc)

# 2010
g <- clustered_bigram_graph(text_2010_bi_counts, threshold = 1)
gcc <- largest_component(g)
plot_component(gcc)

9

##### Import data
# Read the 1997 text and flatten the resulting tibble into a single vector.
# NOTE(review): read_csv() splits fields on commas, which may be related to
# the parsing warnings reported below — confirm whether a line reader such
# as readr::read_lines() would suit these plain-text files better.
text_1997 <- read_csv("AppleWWDC1997_es.txt", col_names = FALSE, show_col_types = FALSE) %>%
  c() %>%
  unlist()
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
# Read the 2001 text and flatten the resulting tibble into a single vector.
text_2001 <- read_csv("AppleWWDC2001_es.txt", col_names = FALSE, show_col_types = FALSE) %>%
  c() %>%
  unlist()
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
# Read the 2005 text and flatten the resulting tibble into a single vector.
text_2005 <- read_csv("AppleWWDC2005_es.txt", col_names = FALSE, show_col_types = FALSE) %>%
  c() %>%
  unlist()
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
# Read the 2008 text and flatten the resulting tibble into a single vector.
text_2008 <- read_csv("AppleWWDC2008_es.txt", col_names = FALSE, show_col_types = FALSE) %>%
  c() %>%
  unlist()
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
# Read the 2010 text and flatten the resulting tibble into a single vector.
text_2010 <- read_csv("AppleWWDC2010_es.txt", col_names = FALSE, show_col_types = FALSE) %>%
  c() %>%
  unlist()
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
# Drop the element names left by unlist() and wrap each text in a tibble
# with a running line number next to the text itself.
# seq_along() replaces 1:length(): for an empty vector 1:length(x) yields
# c(1, 0), which does not match the zero-length `text` column.
names(text_1997) <- NULL
text_1997 <- tibble(line = seq_along(text_1997), text = text_1997)
names(text_2001) <- NULL
text_2001 <- tibble(line = seq_along(text_2001), text = text_2001)
names(text_2005) <- NULL
text_2005 <- tibble(line = seq_along(text_2005), text = text_2005)
names(text_2008) <- NULL
text_2008 <- tibble(line = seq_along(text_2008), text = text_2008)
names(text_2010) <- NULL
text_2010 <- tibble(line = seq_along(text_2010), text = text_2010)
##### Tokenize into skip-grams
# Here each token is a unigram, a regular bigram, or a bigram with a gap.
text_1997_skip <- text_1997 %>%
  unnest_tokens(output = skipgram, input = text, token = "skip_ngrams", n = 2) %>%
  filter(!is.na(skipgram))
dim(text_1997_skip)
## [1] 31144     2
# Skip-gram tokens for 2001, without missing tokens.
text_2001_skip <- text_2001 %>%
  unnest_tokens(output = skipgram, input = text, token = "skip_ngrams", n = 2) %>%
  filter(!is.na(skipgram))
dim(text_2001_skip)
## [1] 40728     2
# Skip-gram tokens for 2005, without missing tokens.
text_2005_skip <- text_2005 %>%
  unnest_tokens(output = skipgram, input = text, token = "skip_ngrams", n = 2) %>%
  filter(!is.na(skipgram))
dim(text_2005_skip)
## [1] 22792     2
# Skip-gram tokens for 2008, without missing tokens.
text_2008_skip <- text_2008 %>%
  unnest_tokens(output = skipgram, input = text, token = "skip_ngrams", n = 2) %>%
  filter(!is.na(skipgram))
dim(text_2008_skip)
## [1] 40861     2
# Skip-gram tokens for 2010, without missing tokens.
text_2010_skip <- text_2010 %>%
  unnest_tokens(output = skipgram, input = text, token = "skip_ngrams", n = 2) %>%
  filter(!is.na(skipgram))
dim(text_2010_skip)
## [1] 17871     2
# First ten skip-gram tokens for 1997 (unigrams still included).
text_1997_skip %>% head(n = 10)
## # A tibble: 10 × 2
##     line skipgram        
##    <int> <chr>           
##  1     1 buenos          
##  2     1 buenos días     
##  3     1 días            
##  4     2 ambos           
##  5     2 ambos llevaban  
##  6     2 ambos corbata   
##  7     2 llevaban        
##  8     2 llevaban corbata
##  9     2 llevaban toda   
## 10     2 corbata
# First ten skip-gram tokens for 2001 (unigrams still included).
text_2001_skip %>% head(n = 10)
## # A tibble: 10 × 2
##     line skipgram         
##    <int> <chr>            
##  1     1 buenos           
##  2     1 buenos días      
##  3     1 días             
##  4     2 estamos          
##  5     2 estamos muy      
##  6     2 estamos contentos
##  7     2 muy              
##  8     2 muy contentos    
##  9     2 muy de           
## 10     2 contentos
# First ten skip-gram tokens for 2005 (unigrams still included).
text_2005_skip %>% head(n = 10)
## # A tibble: 10 × 2
##     line skipgram           
##    <int> <chr>              
##  1     1 bienvenidos        
##  2     1 bienvenidos a      
##  3     1 bienvenidos nuestra
##  4     1 a                  
##  5     1 a nuestra          
##  6     1 a conferencia      
##  7     1 nuestra            
##  8     1 nuestra conferencia
##  9     1 nuestra mundial    
## 10     1 conferencia
# First ten skip-gram tokens for 2008 (unigrams still included).
text_2008_skip %>% head(n = 10)
## # A tibble: 10 × 2
##     line skipgram      
##    <int> <chr>         
##  1     1 estoy         
##  2     1 estoy muy     
##  3     1 estoy contento
##  4     1 muy           
##  5     1 muy contento  
##  6     1 muy de        
##  7     1 contento      
##  8     1 contento de   
##  9     1 contento estar
## 10     1 de
# First ten skip-gram tokens for 2010 (unigrams still included).
text_2010_skip %>% head(n = 10)
## # A tibble: 10 × 2
##     line skipgram       
##    <int> <chr>          
##  1     1 así            
##  2     1 así que        
##  3     1 así volvamos   
##  4     1 que            
##  5     1 que volvamos   
##  6     1 que al         
##  7     1 volvamos       
##  8     1 volvamos al    
##  9     1 volvamos iphone
## 10     1 al
suppressMessages(suppressWarnings(library(ngram)))
# Count the words in each skip-gram.
text_1997_skip$num_words <- map_int(text_1997_skip$skipgram, wordcount)
text_1997_skip %>% head(n = 10)
## # A tibble: 10 × 3
##     line skipgram         num_words
##    <int> <chr>                <int>
##  1     1 buenos                   1
##  2     1 buenos días              2
##  3     1 días                     1
##  4     2 ambos                    1
##  5     2 ambos llevaban           2
##  6     2 ambos corbata            2
##  7     2 llevaban                 1
##  8     2 llevaban corbata         2
##  9     2 llevaban toda            2
## 10     2 corbata                  1
# Remove unigrams: keep the two-word skip-grams and drop the helper column.
text_1997_skip <- text_1997_skip %>%
  filter(num_words == 2) %>%
  select(-num_words)
dim(text_1997_skip)
## [1] 19788     2
suppressMessages(suppressWarnings(library(ngram)))
# Count the words in each skip-gram.
text_2001_skip$num_words1 <- map_int(text_2001_skip$skipgram, wordcount)
text_2001_skip %>% head(n = 10)
## # A tibble: 10 × 3
##     line skipgram          num_words1
##    <int> <chr>                  <int>
##  1     1 buenos                     1
##  2     1 buenos días                2
##  3     1 días                       1
##  4     2 estamos                    1
##  5     2 estamos muy                2
##  6     2 estamos contentos          2
##  7     2 muy                        1
##  8     2 muy contentos              2
##  9     2 muy de                     2
## 10     2 contentos                  1
# Remove unigrams: keep the two-word skip-grams and drop the helper column.
text_2001_skip <- text_2001_skip %>%
  filter(num_words1 == 2) %>%
  select(-num_words1)
dim(text_2001_skip)
## [1] 25629     2
suppressMessages(suppressWarnings(library(ngram)))
# Count the words in each skip-gram.
text_2005_skip$num_words2 <- map_int(text_2005_skip$skipgram, wordcount)
text_2005_skip %>% head(n = 10)
## # A tibble: 10 × 3
##     line skipgram            num_words2
##    <int> <chr>                    <int>
##  1     1 bienvenidos                  1
##  2     1 bienvenidos a                2
##  3     1 bienvenidos nuestra          2
##  4     1 a                            1
##  5     1 a nuestra                    2
##  6     1 a conferencia                2
##  7     1 nuestra                      1
##  8     1 nuestra conferencia          2
##  9     1 nuestra mundial              2
## 10     1 conferencia                  1
# Remove unigrams: keep the two-word skip-grams and drop the helper column.
text_2005_skip <- text_2005_skip %>%
  filter(num_words2 == 2) %>%
  select(-num_words2)
dim(text_2005_skip)
## [1] 14690     2
suppressMessages(suppressWarnings(library(ngram)))
# Count the words in each skip-gram.
text_2008_skip$num_words3 <- map_int(text_2008_skip$skipgram, wordcount)
text_2008_skip %>% head(n = 10)
## # A tibble: 10 × 3
##     line skipgram       num_words3
##    <int> <chr>               <int>
##  1     1 estoy                   1
##  2     1 estoy muy               2
##  3     1 estoy contento          2
##  4     1 muy                     1
##  5     1 muy contento            2
##  6     1 muy de                  2
##  7     1 contento                1
##  8     1 contento de             2
##  9     1 contento estar          2
## 10     1 de                      1
# Remove unigrams: keep the two-word skip-grams and drop the helper column.
text_2008_skip <- text_2008_skip %>%
  filter(num_words3 == 2) %>%
  select(-num_words3)
dim(text_2008_skip)
## [1] 26346     2
suppressMessages(suppressWarnings(library(ngram)))
# Count the words in each skip-gram.
text_2010_skip$num_words4 <- map_int(text_2010_skip$skipgram, wordcount)
text_2010_skip %>% head(n = 10)
## # A tibble: 10 × 3
##     line skipgram        num_words4
##    <int> <chr>                <int>
##  1     1 así                      1
##  2     1 así que                  2
##  3     1 así volvamos             2
##  4     1 que                      1
##  5     1 que volvamos             2
##  6     1 que al                   2
##  7     1 volvamos                 1
##  8     1 volvamos al              2
##  9     1 volvamos iphone          2
## 10     1 al                       1
# Remove unigrams: keep the two-word skip-grams and drop the helper column.
text_2010_skip <- text_2010_skip %>%
  filter(num_words4 == 2) %>%
  select(-num_words4)
dim(text_2010_skip)
## [1] 11459     2
# First ten two-word skip-grams for 1997.
text_1997_skip %>% head(n = 10)
## # A tibble: 10 × 2
##     line skipgram        
##    <int> <chr>           
##  1     1 buenos días     
##  2     2 ambos llevaban  
##  3     2 ambos corbata   
##  4     2 llevaban corbata
##  5     2 llevaban toda   
##  6     2 corbata toda    
##  7     2 corbata la      
##  8     2 toda la         
##  9     2 toda semana     
## 10     2 la semana
# First ten two-word skip-grams for 2001.
text_2001_skip %>% head(n = 10)
## # A tibble: 10 × 2
##     line skipgram         
##    <int> <chr>            
##  1     1 buenos días      
##  2     2 estamos muy      
##  3     2 estamos contentos
##  4     2 muy contentos    
##  5     2 muy de           
##  6     2 contentos de     
##  7     2 contentos estar  
##  8     2 de estar         
##  9     2 de aquí          
## 10     2 estar aquí
# First ten two-word skip-grams for 2005.
text_2005_skip %>% head(n = 10)
## # A tibble: 10 × 2
##     line skipgram               
##    <int> <chr>                  
##  1     1 bienvenidos a          
##  2     1 bienvenidos nuestra    
##  3     1 a nuestra              
##  4     1 a conferencia          
##  5     1 nuestra conferencia    
##  6     1 nuestra mundial        
##  7     1 conferencia mundial    
##  8     1 conferencia de         
##  9     1 mundial de             
## 10     1 mundial desarrolladores
# First ten two-word skip-grams for 2008.
text_2008_skip %>% head(n = 10)
## # A tibble: 10 × 2
##     line skipgram      
##    <int> <chr>         
##  1     1 estoy muy     
##  2     1 estoy contento
##  3     1 muy contento  
##  4     1 muy de        
##  5     1 contento de   
##  6     1 contento estar
##  7     1 de estar      
##  8     1 de aquí       
##  9     1 estar aquí    
## 10     1 estar esta
# First ten two-word skip-grams for 2010.
text_2010_skip %>% head(n = 10)
## # A tibble: 10 × 2
##     line skipgram       
##    <int> <chr>          
##  1     1 así que        
##  2     1 así volvamos   
##  3     1 que volvamos   
##  4     1 que al         
##  5     1 volvamos al    
##  6     1 volvamos iphone
##  7     1 al iphone      
##  8     2 en 2007        
##  9     2 en el          
## 10     2 2007 el
##### omitir stop words
text_1997_skip %>%
  separate(skipgram, c("word1", "word2"), sep = " ") %>%
  filter(!grepl(pattern = '[0-9]', x = word1)) %>%
  filter(!grepl(pattern = '[0-9]', x = word2)) %>%
  filter(!word1 %in% stop_words_es$word) %>%
  filter(!word2 %in% stop_words_es$word) %>%
  mutate(word1 = chartr(old = names(replacement_list) %>% str_c(collapse = ''), 
                       new = replacement_list %>% str_c(collapse = ''),
                       x = word1)) %>%
  mutate(word2 = chartr(old = names(replacement_list) %>% str_c(collapse = ''), 
                       new = replacement_list %>% str_c(collapse = ''),
                       x = word2)) %>%
  filter(!is.na(word1)) %>% 
  filter(!is.na(word2)) %>%
  count(word1, word2, sort = TRUE) %>%
  rename(weight = n) -> text_1997_skip_counts
dim(text_1997_skip_counts)
## [1] 1852    3
text_2001_skip %>%
  separate(skipgram, c("word1", "word2"), sep = " ") %>%
  filter(!grepl(pattern = '[0-9]', x = word1)) %>%
  filter(!grepl(pattern = '[0-9]', x = word2)) %>%
  filter(!word1 %in% stop_words_es$word) %>%
  filter(!word2 %in% stop_words_es$word) %>%
  mutate(word1 = chartr(old = names(replacement_list) %>% str_c(collapse = ''), 
                       new = replacement_list %>% str_c(collapse = ''),
                       x = word1)) %>%
  mutate(word2 = chartr(old = names(replacement_list) %>% str_c(collapse = ''), 
                       new = replacement_list %>% str_c(collapse = ''),
                       x = word2)) %>%
  filter(!is.na(word1)) %>% 
  filter(!is.na(word2)) %>%
  count(word1, word2, sort = TRUE) %>%
  rename(weight = n) -> text_2001_skip_counts
dim(text_2001_skip_counts)
## [1] 2901    3
text_2005_skip %>%
  separate(skipgram, c("word1", "word2"), sep = " ") %>%
  filter(!grepl(pattern = '[0-9]', x = word1)) %>%
  filter(!grepl(pattern = '[0-9]', x = word2)) %>%
  filter(!word1 %in% stop_words_es$word) %>%
  filter(!word2 %in% stop_words_es$word) %>%
  mutate(word1 = chartr(old = names(replacement_list) %>% str_c(collapse = ''), 
                       new = replacement_list %>% str_c(collapse = ''),
                       x = word1)) %>%
  mutate(word2 = chartr(old = names(replacement_list) %>% str_c(collapse = ''), 
                       new = replacement_list %>% str_c(collapse = ''),
                       x = word2)) %>%
  filter(!is.na(word1)) %>% 
  filter(!is.na(word2)) %>%
  count(word1, word2, sort = TRUE) %>%
  rename(weight = n) -> text_2005_skip_counts
dim(text_2005_skip_counts)
## [1] 1580    3
# Weighted word-pair table for the 2008 text.
# Each skip-gram is split on the space into word1/word2; pairs containing a
# digit or a word listed in stop_words_es$word are discarded; both words are
# then translated character-for-character (chartr) using replacement_list.
text_2008_skip_counts <- text_2008_skip %>%
  separate(col = skipgram, into = c("word1", "word2"), sep = " ") %>%
  filter(
    !grepl(pattern = '[0-9]', x = word1),
    !grepl(pattern = '[0-9]', x = word2),
    !word1 %in% stop_words_es$word,
    !word2 %in% stop_words_es$word
  ) %>%
  mutate(across(
    c(word1, word2),
    function(w) {
      chartr(
        old = str_c(names(replacement_list), collapse = ''),
        new = str_c(replacement_list, collapse = ''),
        x = w
      )
    }
  )) %>%
  # Rows where either word is missing cannot form a pair.
  filter(!is.na(word1), !is.na(word2)) %>%
  # One row per distinct pair, most frequent first; the count is the weight.
  count(word1, word2, sort = TRUE, name = "weight")
dim(text_2008_skip_counts)
## [1] 3065    3
# Weighted word-pair table for the 2010 text.
# Each skip-gram is split on the space into word1/word2; pairs containing a
# digit or a word listed in stop_words_es$word are discarded; both words are
# then translated character-for-character (chartr) using replacement_list.
text_2010_skip_counts <- text_2010_skip %>%
  separate(col = skipgram, into = c("word1", "word2"), sep = " ") %>%
  filter(
    !grepl(pattern = '[0-9]', x = word1),
    !grepl(pattern = '[0-9]', x = word2),
    !word1 %in% stop_words_es$word,
    !word2 %in% stop_words_es$word
  ) %>%
  mutate(across(
    c(word1, word2),
    function(w) {
      chartr(
        old = str_c(names(replacement_list), collapse = ''),
        new = str_c(replacement_list, collapse = ''),
        x = w
      )
    }
  )) %>%
  # Rows where either word is missing cannot form a pair.
  filter(!is.na(word1), !is.na(word2)) %>%
  # One row per distinct pair, most frequent first; the count is the weight.
  count(word1, word2, sort = TRUE, name = "weight")
dim(text_2010_skip_counts)
## [1] 1219    3
# Preview the first 10 rows of each weighted word-pair table. The recorded
# output under each call lists the pairs in descending order of weight.
# 1997 text
head(text_1997_skip_counts, n = 10)
## # A tibble: 10 × 3
##    word1     word2       weight
##    <chr>     <chr>        <int>
##  1 rap       city             8
##  2 correo    electronico      7
##  3 punto     vista            7
##  4 street    journal          6
##  5 wall      journal          6
##  6 wall      street           6
##  7 apple     deberia          5
##  8 creadores clones           5
##  9 hardware  apple            5
## 10 productos realmente        5
# 2001 text
head(text_2001_skip_counts, n = 10)
## # A tibble: 10 × 3
##    word1      word2     weight
##    <chr>      <chr>      <int>
##  1 mac        os            52
##  2 sistema    operativo      9
##  3 super      drive          9
##  4 power      mac            8
##  5 gracias    steve          7
##  6 disco      duro           6
##  7 disponible mac            6
##  8 etapas     pipeline       6
##  9 power      max            6
## 10 centro     comercial      5
# 2005 text
head(text_2005_skip_counts, n = 10)
## # A tibble: 10 × 3
##    word1        word2       weight
##    <chr>        <chr>        <int>
##  1 sistema      operativo       12
##  2 mac          os              10
##  3 binarios     universales      7
##  4 has          visto            7
##  5 codigo       fuente           6
##  6 procesadores intel            6
##  7 año          viene            5
##  8 dejame       abrir            5
##  9 excelentes   productos        5
## 10 powerpc      intel            5
# 2008 text
head(text_2008_skip_counts, n = 10)
## # A tibble: 10 × 3
##    word1        word2        weight
##    <chr>        <chr>         <int>
##  1 correo       electronico      25
##  2 gustaria     invitar          12
##  3 software     iphone           12
##  4 app          store            11
##  5 mis          contactos         8
##  6 barra        herramientas      6
##  7 directamente tu                6
##  8 interfaz     usuario           6
##  9 sdk          iphone            6
## 10 sitio        web               6
# 2010 text
head(text_2010_skip_counts, n = 10)
## # A tibble: 10 × 3
##    word1        word2      weight
##    <chr>        <chr>       <int>
##  1 pantalla     retina         14
##  2 acero        inoxidable      7
##  3 wi           fi              7
##  4 estaciones   base            5
##  5 pixeles      pulgada         5
##  6 tu           telefono        5
##  7 alta         definicion      4
##  8 directamente tu              4
##  9 flash        led             4
## 10 imovie       iphone          4
##### Build a network from the word-pair frequencies (weight): 1997 text
# Only pairs observed more than once become edges.
g <- text_1997_skip_counts %>%
  filter(weight > 1) %>%
  graph_from_data_frame(directed = FALSE)
# Important: with igraph's defaults this removes multi-edges and self-loops.
g <- igraph::simplify(g)
# Subgraph induced by the largest connected component.
# components() replaces the deprecated clusters(); the result is computed once
# and reused for both the membership labels and the component sizes.
g_comp <- igraph::components(graph = g)
V(g)$cluster <- g_comp$membership
gcc <- induced_subgraph(
  graph = g,
  vids = which(V(g)$cluster == which.max(g_comp$csize))
)
# Two panels side by side.
par(mfrow = c(1, 2), mar = c(1, 1, 2, 1), mgp = c(1, 1, 1))
# viz 1: uniform vertex size (same seed as viz 2 so the layouts match)
set.seed(123)
plot(
  gcc,
  layout = layout_with_fr,
  vertex.color = 1,
  vertex.frame.color = 1,
  vertex.size = 3,
  vertex.label = NA
)
# viz 2: vertex size proportional to vertex strength
set.seed(123)
plot(
  gcc,
  layout = layout_with_fr,
  vertex.color = adjustcolor('darkolivegreen4', 0.1),
  vertex.frame.color = 'darkolivegreen4',
  vertex.size = 2 * strength(gcc),
  vertex.label = NA
)
title(main = "Componente conexa", outer = TRUE, line = -1)

##### Build a network from the word-pair frequencies (weight): 2001 text
# Only pairs observed more than once become edges.
g <- text_2001_skip_counts %>%
  filter(weight > 1) %>%
  graph_from_data_frame(directed = FALSE)
# Important: with igraph's defaults this removes multi-edges and self-loops.
g <- igraph::simplify(g)
# Subgraph induced by the largest connected component.
# components() replaces the deprecated clusters(); the result is computed once
# and reused for both the membership labels and the component sizes.
g_comp <- igraph::components(graph = g)
V(g)$cluster <- g_comp$membership
gcc <- induced_subgraph(
  graph = g,
  vids = which(V(g)$cluster == which.max(g_comp$csize))
)
# Two panels side by side.
par(mfrow = c(1, 2), mar = c(1, 1, 2, 1), mgp = c(1, 1, 1))
# viz 1: uniform vertex size (same seed as viz 2 so the layouts match)
set.seed(123)
plot(
  gcc,
  layout = layout_with_fr,
  vertex.color = 1,
  vertex.frame.color = 1,
  vertex.size = 3,
  vertex.label = NA
)
# viz 2: vertex size proportional to vertex strength
set.seed(123)
plot(
  gcc,
  layout = layout_with_fr,
  vertex.color = adjustcolor('darkolivegreen4', 0.1),
  vertex.frame.color = 'darkolivegreen4',
  vertex.size = 2 * strength(gcc),
  vertex.label = NA
)
title(main = "Componente conexa", outer = TRUE, line = -1)

##### Build a network from the word-pair frequencies (weight): 2005 text
# Only pairs observed more than once become edges.
g <- text_2005_skip_counts %>%
  filter(weight > 1) %>%
  graph_from_data_frame(directed = FALSE)
# Important: with igraph's defaults this removes multi-edges and self-loops.
g <- igraph::simplify(g)
# Subgraph induced by the largest connected component.
# components() replaces the deprecated clusters(); the result is computed once
# and reused for both the membership labels and the component sizes.
g_comp <- igraph::components(graph = g)
V(g)$cluster <- g_comp$membership
gcc <- induced_subgraph(
  graph = g,
  vids = which(V(g)$cluster == which.max(g_comp$csize))
)
# Two panels side by side.
par(mfrow = c(1, 2), mar = c(1, 1, 2, 1), mgp = c(1, 1, 1))
# viz 1: uniform vertex size (same seed as viz 2 so the layouts match)
set.seed(123)
plot(
  gcc,
  layout = layout_with_fr,
  vertex.color = 1,
  vertex.frame.color = 1,
  vertex.size = 3,
  vertex.label = NA
)
# viz 2: vertex size proportional to vertex strength
set.seed(123)
plot(
  gcc,
  layout = layout_with_fr,
  vertex.color = adjustcolor('darkolivegreen4', 0.1),
  vertex.frame.color = 'darkolivegreen4',
  vertex.size = 2 * strength(gcc),
  vertex.label = NA
)
title(main = "Componente conexa", outer = TRUE, line = -1)

##### Build a network from the word-pair frequencies (weight): 2008 text
# Only pairs observed more than once become edges.
g <- text_2008_skip_counts %>%
  filter(weight > 1) %>%
  graph_from_data_frame(directed = FALSE)
# Important: with igraph's defaults this removes multi-edges and self-loops.
g <- igraph::simplify(g)
# Subgraph induced by the largest connected component.
# components() replaces the deprecated clusters(); the result is computed once
# and reused for both the membership labels and the component sizes.
g_comp <- igraph::components(graph = g)
V(g)$cluster <- g_comp$membership
gcc <- induced_subgraph(
  graph = g,
  vids = which(V(g)$cluster == which.max(g_comp$csize))
)
# Two panels side by side.
par(mfrow = c(1, 2), mar = c(1, 1, 2, 1), mgp = c(1, 1, 1))
# viz 1: uniform vertex size (same seed as viz 2 so the layouts match)
set.seed(123)
plot(
  gcc,
  layout = layout_with_fr,
  vertex.color = 1,
  vertex.frame.color = 1,
  vertex.size = 3,
  vertex.label = NA
)
# viz 2: vertex size proportional to vertex strength
set.seed(123)
plot(
  gcc,
  layout = layout_with_fr,
  vertex.color = adjustcolor('darkolivegreen4', 0.1),
  vertex.frame.color = 'darkolivegreen4',
  vertex.size = 2 * strength(gcc),
  vertex.label = NA
)
title(main = "Componente conexa", outer = TRUE, line = -1)

##### Build a network from the word-pair frequencies (weight): 2010 text
# Only pairs observed more than once become edges.
g <- text_2010_skip_counts %>%
  filter(weight > 1) %>%
  graph_from_data_frame(directed = FALSE)
# Important: with igraph's defaults this removes multi-edges and self-loops.
g <- igraph::simplify(g)
# Subgraph induced by the largest connected component.
# components() replaces the deprecated clusters(); the result is computed once
# and reused for both the membership labels and the component sizes.
g_comp <- igraph::components(graph = g)
V(g)$cluster <- g_comp$membership
gcc <- induced_subgraph(
  graph = g,
  vids = which(V(g)$cluster == which.max(g_comp$csize))
)
# Two panels side by side.
par(mfrow = c(1, 2), mar = c(1, 1, 2, 1), mgp = c(1, 1, 1))
# viz 1: uniform vertex size (same seed as viz 2 so the layouts match)
set.seed(123)
plot(
  gcc,
  layout = layout_with_fr,
  vertex.color = 1,
  vertex.frame.color = 1,
  vertex.size = 3,
  vertex.label = NA
)
# viz 2: vertex size proportional to vertex strength
set.seed(123)
plot(
  gcc,
  layout = layout_with_fr,
  vertex.color = adjustcolor('darkolivegreen4', 0.1),
  vertex.frame.color = 'darkolivegreen4',
  vertex.size = 2 * strength(gcc),
  vertex.label = NA
)
title(main = "Componente conexa", outer = TRUE, line = -1)